import tensorflow as tf
import numpy as np
import pandas as pd
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dropout, Dense, BatchNormalization, Activation, concatenate, GRU, Embedding, Flatten, BatchNormalization
# Load the article/summary dataset and keep only rows where both the
# source text and the reference summary are present.
df = pd.read_csv("summarize_text.csv")
df = df[['article', 'highlights']].dropna()
def preprocess_text(text):
    """Normalize raw article/summary text for tokenization.

    Steps: lowercase, drop parenthesized asides, drop double quotes,
    replace every character outside letters/digits/``?.!,`` with a
    space, and collapse whitespace runs.

    Args:
        text: raw input string.

    Returns:
        Cleaned, lowercased string with single internal spaces and no
        leading/trailing whitespace.
    """
    text = text.lower()
    text = re.sub(r'\([^)]*\)', '', text)          # drop parenthesized asides
    text = re.sub('"', '', text)                   # drop double quotes
    text = re.sub(r'[^a-zA-Z?.!,\d]', ' ', text)   # keep letters, digits, basic punctuation
    text = re.sub(r'\s+', ' ', text)               # collapse whitespace runs
    # FIX: removing asides/characters can leave leading/trailing spaces;
    # strip them so the tokenizer never sees padding whitespace.
    return text.strip()

# Clean both the source articles and the reference summaries in place.
df['article'] = df['article'].apply(preprocess_text)
df['highlights'] = df['highlights'].apply(preprocess_text)
# Tokenizer for input text
text_tokenizer = Tokenizer()
text_tokenizer.fit_on_texts(df['article'])
# Map each article to a sequence of integer word ids.
X_train = text_tokenizer.texts_to_sequences(df['article'])
# Longest article length (+1 headroom); also fixes the encoder input width.
max_article_length = pd.Series(X_train).map(len).max()+1
# Zero-pad every article at the end to a common length.
X_train_padded = pad_sequences(X_train, maxlen=max_article_length, padding='post')
print(X_train_padded.shape)
(9, 866)
# Wrap every target summary in explicit start/end markers so the decoder
# has a seed token and a stop token during greedy decoding.
df['highlights'] = df['highlights'].apply(lambda s: f"startofseq {s} endofseq")
# Separate tokenizer/vocabulary for the summary side.
summary_tokenizer = Tokenizer()
summary_tokenizer.fit_on_texts(df['highlights'])
Y_train = summary_tokenizer.texts_to_sequences(df['highlights'])
# Longest summary length (+1 headroom); bounds decoding steps at inference.
max_highlights_length = pd.Series(Y_train).map(len).max()+1
Y_train_padded = pad_sequences(Y_train, maxlen=max_highlights_length, padding='post')
print(Y_train_padded.shape)
(9, 94)
max_highlights_length
94
# Vocabulary sizes (+1 because Keras tokenizer indices start at 1 and
# id 0 is reserved for padding).
text_vocab_size = len(text_tokenizer.word_index) + 1
summary_vocab_size = len(summary_tokenizer.word_index) + 1
class AttentionLayer(tf.keras.layers.Layer):
    """Additive (Bahdanau-style) attention over encoder outputs.

    Scores every (encoder step, decoder step) pair in one broadcasted
    pass, then reduces over the encoder axis to yield one context
    vector per decoder step.

    NOTE(review): this layer does not propagate Keras masks, so padded
    encoder positions also receive attention weight — confirm whether
    masking should be added.
    """

    def __init__(self, units):
        super().__init__()
        self.W1 = tf.keras.layers.Dense(units)  # projects encoder outputs
        self.W2 = tf.keras.layers.Dense(units)  # projects decoder outputs
        self.V = tf.keras.layers.Dense(1)       # collapses to a scalar score

    def call(self, encoder_outputs, decoder_outputs):
        # Insert broadcast axes so the two sequences can be combined:
        #   decoder: (batch, 1,     T_dec, dec_units)
        #   encoder: (batch, T_enc, 1,     enc_units)
        dec_expanded = tf.expand_dims(decoder_outputs, axis=1)
        enc_expanded = tf.expand_dims(encoder_outputs, axis=2)

        # Additive score for every encoder/decoder step pair:
        # (batch, T_enc, T_dec, 1)
        score = self.V(tf.nn.tanh(self.W1(enc_expanded) + self.W2(dec_expanded)))

        # Normalize across encoder time steps.
        attention_weights = tf.nn.softmax(score, axis=1)

        # Weighted sum over encoder steps -> (batch, T_dec, enc_units).
        context_vector = tf.reduce_sum(attention_weights * enc_expanded, axis=1)
        return context_vector, attention_weights
from tensorflow.keras.layers import Bidirectional, LSTM, Embedding, Dense, Input, Concatenate

# Hyperparameters: small embedding, attention projection width.
embed_size=25
attention_units = 512

# encoder: bidirectional LSTM over the padded article sequence.
encoder_inputs = Input(shape=(max_article_length,))
# mask_zero=True marks padding ids; NOTE(review): the custom AttentionLayer
# does not support masking, so the mask is dropped downstream (see warning).
encoder_embedding = Embedding(text_vocab_size, embed_size, mask_zero=True)(encoder_inputs)
encoder_lstm = Bidirectional(LSTM(512, return_state=True, return_sequences=True))  
encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder_lstm(encoder_embedding)
# Concatenate forward/backward states -> 1024-d vectors that seed the decoder.
state_h = Concatenate()([forward_h, backward_h])
state_c = Concatenate()([forward_c, backward_c])
encoder_states = [state_h, state_c]

# decder: single LSTM (1024 = 2*512 so it can consume the encoder states).
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(summary_vocab_size, embed_size, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(1024, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)

# attention_layer: context vector per decoder step over encoder outputs.
attention_layer = AttentionLayer(units=attention_units)  # Pass required parameter
attention_outputs, attention_weights = attention_layer(encoder_outputs, decoder_outputs)
#combined layers: decoder hidden state + attention context -> vocab softmax.
decoder_combined_context = Concatenate()([decoder_outputs, attention_outputs])

output_layer = Dense(summary_vocab_size, activation='softmax')
outputs = output_layer(decoder_combined_context)

model = tf.keras.models.Model([encoder_inputs, decoder_inputs], outputs)
# Sparse loss: targets are integer token ids, not one-hot vectors.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.summary()
D:\Anaconda\Lib\site-packages\keras\src\layers\layer.py:932: UserWarning: Layer 'attention_layer_2' (of type AttentionLayer) was passed an input with a mask attached to it. However, this layer does not support masking and will therefore destroy the mask information. Downstream layers will not see the mask.
  warnings.warn(
Model: "functional_3"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ Layer (type)                   Output Shape                       Param #  Connected to               ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ input_layer_6 (InputLayer)    │ (None, 866)               │               0 │ -                          │
├───────────────────────────────┼───────────────────────────┼─────────────────┼────────────────────────────┤
│ embedding_4 (Embedding)       │ (None, 866, 25)           │          32,225 │ input_layer_6[0][0]        │
├───────────────────────────────┼───────────────────────────┼─────────────────┼────────────────────────────┤
│ not_equal_4 (NotEqual)        │ (None, 866)               │               0 │ input_layer_6[0][0]        │
├───────────────────────────────┼───────────────────────────┼─────────────────┼────────────────────────────┤
│ input_layer_7 (InputLayer)    │ (None, None)              │               0 │ -                          │
├───────────────────────────────┼───────────────────────────┼─────────────────┼────────────────────────────┤
│ bidirectional_2               │ [(None, 866, 1024),       │       2,203,648 │ embedding_4[0][0],         │
│ (Bidirectional)               │ (None, 512), (None, 512), │                 │ not_equal_4[0][0]          │
│                               │ (None, 512), (None, 512)] │                 │                            │
├───────────────────────────────┼───────────────────────────┼─────────────────┼────────────────────────────┤
│ embedding_5 (Embedding)       │ (None, None, 25)          │           6,775 │ input_layer_7[0][0]        │
├───────────────────────────────┼───────────────────────────┼─────────────────┼────────────────────────────┤
│ concatenate_8 (Concatenate)   │ (None, 1024)              │               0 │ bidirectional_2[0][1],     │
│                               │                           │                 │ bidirectional_2[0][3]      │
├───────────────────────────────┼───────────────────────────┼─────────────────┼────────────────────────────┤
│ concatenate_9 (Concatenate)   │ (None, 1024)              │               0 │ bidirectional_2[0][2],     │
│                               │                           │                 │ bidirectional_2[0][4]      │
├───────────────────────────────┼───────────────────────────┼─────────────────┼────────────────────────────┤
│ lstm_5 (LSTM)                 │ [(None, None, 1024),      │       4,300,800 │ embedding_5[0][0],         │
│                               │ (None, 1024), (None,      │                 │ concatenate_8[0][0],       │
│                               │ 1024)]                    │                 │ concatenate_9[0][0]        │
├───────────────────────────────┼───────────────────────────┼─────────────────┼────────────────────────────┤
│ attention_layer_2             │ [(None, None, 1024),      │       1,050,113 │ bidirectional_2[0][0],     │
│ (AttentionLayer)              │ (None, 866, None, 1)]     │                 │ lstm_5[0][0]               │
├───────────────────────────────┼───────────────────────────┼─────────────────┼────────────────────────────┤
│ concatenate_10 (Concatenate)  │ (None, None, 2048)        │               0 │ lstm_5[0][0],              │
│                               │                           │                 │ attention_layer_2[0][0]    │
├───────────────────────────────┼───────────────────────────┼─────────────────┼────────────────────────────┤
│ dense_10 (Dense)              │ (None, None, 271)         │         555,279 │ concatenate_10[0][0]       │
└───────────────────────────────┴───────────────────────────┴─────────────────┴────────────────────────────┘
 Total params: 8,148,840 (31.09 MB)
 Trainable params: 8,148,840 (31.09 MB)
 Non-trainable params: 0 (0.00 B)
# decoder input and output data (teacher forcing): the decoder reads the
# summary shifted right (starts with 'startofseq') and must predict the
# summary shifted left (ends with 'endofseq').
decoder_input_data = Y_train_padded[:, :-1]  
decoder_output_data = Y_train_padded[:, 1:]  
#decoder_input_data = pad_sequences(decoder_input_data, maxlen=max_highlights_length, padding='post')
#decoder_output_data = pad_sequences(decoder_output_data, maxlen=max_highlights_length, padding='post')
print("Decoder input shape:", decoder_input_data.shape)
print("Decoder output shape:", decoder_output_data.shape)
Decoder input shape: (9, 93)
Decoder output shape: (9, 93)
# Training (1 epoch only — a smoke test, not a converged model; the empty
# generated summary later is consistent with this).
history = model.fit(
    [X_train_padded, decoder_input_data],  # Encoder input and decoder input
    decoder_output_data,                   # Decoder output
    epochs=1,
    validation_split=0.2
)
1/1 ━━━━━━━━━━━━━━━━━━━━ 138s 138s/step - accuracy: 0.0000e+00 - loss: 5.6012 - val_accuracy: 0.6022 - val_loss: 5.5283

# Inference models: separate encoder/decoder graphs for step-by-step greedy decoding.

# Encoder inference model: article ids -> (attention memory, initial decoder states).
encoder_model = tf.keras.models.Model(encoder_inputs, [encoder_outputs, state_h, state_c])

# Decoder inference model: runs one step at a time from fed-in states.
decoder_state_input_h = tf.keras.layers.Input(shape=(1024,))
decoder_state_input_c = tf.keras.layers.Input(shape=(1024,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
# BUG FIX: the original listed the *training-graph* tensor `encoder_outputs`
# as a decoder_model input; attention must instead read a real Input that
# receives the encoder outputs fed at predict() time (shape: article steps
# x 1024 bidirectional units).
encoder_outputs_input = tf.keras.layers.Input(shape=(max_article_length, 1024))

decoder_outputs_inf, decoder_state_h, decoder_state_c = decoder_lstm(
    decoder_embedding, initial_state=decoder_states_inputs
)

# attention layerr — same trained layer, now over the fed-in encoder outputs.
attention_outputs_inf, attention_weights_inf = attention_layer(encoder_outputs_input, decoder_outputs_inf)

# Combined context + reused trained projection to the summary vocabulary.
decoder_combined_context_inf = tf.keras.layers.concatenate([decoder_outputs_inf, attention_outputs_inf])
decoder_outputs_inf = output_layer(decoder_combined_context_inf)

decoder_model = tf.keras.models.Model(
    [decoder_inputs] + decoder_states_inputs + [encoder_outputs_input],
    [decoder_outputs_inf, decoder_state_h, decoder_state_c]
)
def summarize_text(input_text):
    """Greedy-decode a summary for `input_text` with the inference models.

    Encodes the article once, then feeds the decoder one token at a
    time — always taking the argmax token — until 'endofseq' (or an
    unknown token id) appears or `max_highlights_length` steps elapse.

    Args:
        input_text: raw article string.

    Returns:
        The generated summary as a space-joined string (may be empty).
    """
    # Tokenize and pad the article exactly like the training inputs.
    seq = text_tokenizer.texts_to_sequences([input_text])
    seq = pad_sequences(seq, maxlen=max_article_length, padding='post')

    # Single encoder pass: attention memory plus initial decoder states.
    enc_out, h, c = encoder_model.predict(seq)

    # Seed the decoder with the start-of-sequence token.
    current_token = np.zeros((1, 1))
    current_token[0, 0] = summary_tokenizer.word_index['startofseq']

    words = []
    for _ in range(max_highlights_length):
        # One decoder step; states are threaded through for the next step.
        probs, h, c = decoder_model.predict([current_token] + [h, c] + [enc_out])

        token_id = np.argmax(probs[0, -1, :])
        word = summary_tokenizer.index_word.get(token_id, '')

        # Stop on the end marker or an id outside the vocabulary (e.g. padding 0).
        if word in ('endofseq', ''):
            break

        words.append(word)

        # The predicted token becomes the next decoder input.
        current_token = np.zeros((1, 1))
        current_token[0, 0] = token_id

    return ' '.join(words)
# Example input text (unseen article; with 1 training epoch the output is
# expected to be poor or empty).
text_sample = "The stock market saw a significant drop today due to global inflation fears and economic downturn signals."

# Generate summary
summary = summarize_text(text_sample)
print("Generated Summary:", summary)
1/1 ━━━━━━━━━━━━━━━━━━━━ 1s 1s/step
1/1 ━━━━━━━━━━━━━━━━━━━━ 1s 550ms/step
Generated Summary: